In [1]:
# %load database-reader.py
import sqlite3
import pandas as pd
from xml.etree import ElementTree

Origin. 3040 raw match obersvations for English Premier League After merged with team arribute. it reduces to 2393 observations.

Missing Data: 1 from match table. we discard PS, SJ, GB, BS betting company daa + player featuures. focus on the Primary League only. 2 discard additional four lines with empty features


In [2]:
db_location = "soccer/database.sqlite"

conn = sqlite3.connect(db_location)

In [3]:
df_match = pd.read_sql(sql="SELECT id,league_id, season, stage, date, match_api_id, home_team_api_id, away_team_api_id, home_team_goal, away_team_goal, goal, shoton, shotoff, foulcommit, card, cross, corner, possession, B365H, B365D, B365A, BWH, BWD, BWA, IWH, IWD, IWA, LBH, LBD, LBA, WHH,WHD,WHA,VCH,VCD,VCA FROM Match WHERE league_id=1729 AND date >= '2010-02-22'", con=conn)
df_match=df_match.dropna()
df_match.head()


Out[3]:
id league_id season stage date match_api_id home_team_api_id away_team_api_id home_team_goal away_team_goal ... IWA LBH LBD LBA WHH WHD WHA VCH VCD VCA
0 2228 1729 2009/2010 2 2010-03-24 00:00:00 658592 8456 8668 0 2 ... 4.30 1.80 3.30 3.75 1.8 3.30 3.75 1.95 3.40 4.00
1 2239 1729 2009/2010 21 2010-03-16 00:00:00 658936 8528 10252 1 2 ... 2.20 3.30 3.25 2.20 3.4 3.25 2.15 3.60 3.25 2.10
2 2240 1729 2009/2010 21 2010-03-23 00:00:00 658937 8654 8602 1 3 ... 4.60 1.80 3.30 3.75 1.7 3.30 4.33 1.80 3.50 4.75
3 2245 1729 2009/2010 21 2010-03-10 00:00:00 658942 8191 10194 1 1 ... 2.85 2.38 3.20 2.60 2.4 3.10 2.60 2.50 3.25 2.88
4 2246 1729 2009/2010 21 2010-03-09 00:00:00 658943 8472 8559 4 0 ... 3.40 2.10 3.20 3.00 2.0 3.20 3.20 2.10 3.25 3.75

5 rows × 36 columns


In [4]:
frames = list()

for index, row in df_match.iterrows():
    hteam=row.home_team_api_id
    ateam=row.away_team_api_id
    date=row.date ##match date
    hquery="SELECT * FROM team_attributes WHERE team_api_id="+str(hteam)
    aquery="SELECT * FROM team_attributes WHERE team_api_id="+str(ateam)
    df_hteam=pd.read_sql(sql=hquery,con=conn)
    df_ateam=pd.read_sql(sql=aquery,con=conn)
    df_hteam=df_hteam[df_hteam.date<=date]
    df_ateam=df_ateam[df_ateam.date<=date]
    df_hlastest= df_hteam.tail(1)
    df_alastest= df_ateam.tail(1)
    df_row=row.to_frame().T
    ##print df_row
    df_row_home=pd.merge(df_row, df_hlastest, how='outer',left_on='home_team_api_id', right_on='team_api_id', suffixes=('_match', '_home'))
    df_row_home_away=pd.merge(df_row_home,df_alastest, how='outer',left_on='away_team_api_id', right_on='team_api_id', suffixes=('_home', '_away'))
    frames.append(df_row_home_away)

    ##hteam_frames.append(df_hlastest)
    ##ateam_frames.append(df_alastest)

df_match_team = pd.concat(frames).reset_index(drop=True)
#df_ateams = pd.concat(ateam_frames)
df_match_team.head()


Out[4]:
id_match league_id season stage date_match match_api_id home_team_api_id away_team_api_id home_team_goal away_team_goal ... chanceCreationShooting_away chanceCreationShootingClass_away chanceCreationPositioningClass_away defencePressure_away defencePressureClass_away defenceAggression_away defenceAggressionClass_away defenceTeamWidth_away defenceTeamWidthClass_away defenceDefenderLineClass_away
0 2228 1729 2009/2010 2 2010-03-24 00:00:00 658592 8456 8668 0 2 ... 45 Normal Organised 40 Medium 70 Double 40 Normal Cover
1 2239 1729 2009/2010 21 2010-03-16 00:00:00 658936 8528 10252 1 2 ... 50 Normal Free Form 30 Deep 70 Double 30 Narrow Cover
2 2240 1729 2009/2010 21 2010-03-23 00:00:00 658937 8654 8602 1 3 ... 70 Lots Organised 70 High 70 Double 70 Wide Cover
3 2245 1729 2009/2010 21 2010-03-10 00:00:00 658942 8191 10194 1 1 ... 55 Normal Organised 35 Medium 70 Double 35 Normal Cover
4 2246 1729 2009/2010 21 2010-03-09 00:00:00 658943 8472 8559 4 0 ... 45 Normal Organised 35 Medium 70 Double 35 Normal Cover

5 rows × 86 columns


In [5]:
# Replace possession XML with the latest value
for index, row in df_match_team.iterrows():
    tree = ElementTree.fromstring(row['possession'])
    elapsed = -1
    possession = -1
    for value in tree:
        for tag in value:
            if tag.tag == 'elapsed':
                curelapsed = int(tag.text)
            if tag.tag == 'homepos':
                curpossession = tag.text
        if elapsed <= curelapsed:
            elapsed = curelapsed
            possession = curpossession
    df_match_team.loc[index, 'possession'] = possession
df_match_team = df_match_team.rename(columns = {'possession' : 'possession_home'})

In [6]:
def parse_xml(field):
    global df_match_team
    # Replace corner XML with the count
    if field == 'card':
        df_match_team['ycard_home'] = 0
        df_match_team['ycard_away'] = 0
        df_match_team['rcard_home'] = 0
        df_match_team['rcard_away'] = 0
    else:
        df_match_team['%s_home' % field] = 0
        df_match_team['%s_away' % field] = 0
        if field == 'cross':
            df_match_team['throwin_home'] = 0
            df_match_team['throwin_away'] = 0
    for index, row in df_match_team.iterrows():
        tree = ElementTree.fromstring(row[field])
        for value in tree:
            ignore = False
            away = ''
            cardtype = ''
            field_save = field
            for tag in value:
                if tag.tag == 'team':
                    current_team = int(tag.text)
                    if current_team == row['home_team_api_id']:
                        away = 'home'
                    elif current_team == row['away_team_api_id']:
                        away = 'away'
                    else:
                        print (current_team)
                        print (row)
                        assert False
                if (tag.tag == 'card_type' or tag.tag == 'comment') and field == 'card':
                    cardtype = tag.text
                    if cardtype == 'y2':
                        cardtype = 'r'
                if tag.tag == 'type':
                    if tag.text != field:
                        if field == 'cross' and tag.text == 'throwin':
                            field_save = tag.text
                        else:
                            if tag.text != 'corner':
                                print ('%s instead of %s' % (tag.text, field))
                            ignore = True
            if not ignore and (away == 'away' or away == 'home'):
                df_match_team.loc[index, '%s%s_%s' % (cardtype, field_save, away)] += 1
    df_match_team = df_match_team.drop(field, axis=1)
    
parse_xml('corner')
parse_xml('cross')
parse_xml('shoton')
parse_xml('shotoff')
parse_xml('foulcommit')
parse_xml('card')

In [7]:
# Drop IDs and dates
df_match_team=df_match_team.drop(labels = ['date', 'id', 'id_match', 'league_id', 'season', 'date_match', 'match_api_id',
                                           'home_team_api_id', 'away_team_api_id', 'team_fifa_api_id_home', 
                                           'team_api_id_home', 'team_fifa_api_id_away', 'team_api_id_away', 
                                           'id_home', 'date_home', 'goal'], axis = 1)

# Use Classes to fill missing values
replacements = []
for index, row in df_match_team.iterrows():
    for column in df_match_team:
        if pd.isnull(row[column]):
            classColumn = column[:-5] + 'Class' + column[-5:]
            if classColumn in df_match_team:
                replacement = df_match_team[column][df_match_team[classColumn] == row[classColumn]].median()
                replacements.append('In %s: replaced %s with %f' % (column, row[classColumn], replacement))
                df_match_team.loc[index, column] = replacement
for r in set(replacements):
    print (r)

# Drop columns that still have missing values                
df_match_team=df_match_team.dropna(axis = 1)

# Drop Classes what have numerical equvialents
for column in df_match_team:
    classColumn = column[:-5] + 'Class' + column[-5:]
    if classColumn in df_match_team:
        df_match_team = df_match_team.drop(classColumn, axis = 1)


In buildUpPlayDribbling_away: replaced Little with 32.000000
In buildUpPlayDribbling_home: replaced Little with 32.000000

In [8]:
for columnName in df_match_team:
    no_unique = len(df_match_team[columnName].unique())
    if no_unique == 1:
        print ('Dropping %s because it has only one 1 unique value' % columnName)
        df_match_team = df_match_team.drop(columnName, axis = 1)
    if no_unique == 2:
        mode = df_match_team[columnName].mode().values[0]
        df_match_team.loc[:,columnName] = df_match_team[columnName] == mode
        df_match_team = df_match_team.rename(columns={columnName: columnName + '_is' + mode})
        print ('Binarizing %s to %s_is%s' % (columnName, columnName, mode))


Binarizing buildUpPlayPositioningClass_home to buildUpPlayPositioningClass_home_isOrganised
Binarizing chanceCreationPositioningClass_home to chanceCreationPositioningClass_home_isOrganised
Dropping defenceDefenderLineClass_home because it has only one 1 unique value
Binarizing buildUpPlayPositioningClass_away to buildUpPlayPositioningClass_away_isOrganised
Binarizing chanceCreationPositioningClass_away to chanceCreationPositioningClass_away_isOrganised
Dropping defenceDefenderLineClass_away because it has only one 1 unique value

In [9]:
df_match_team.to_csv('dataset.csv', index=None)
df_match_team.dtypes


Out[9]:
stage                                               object
home_team_goal                                      object
away_team_goal                                      object
possession_home                                     object
B365H                                               object
B365D                                               object
B365A                                               object
BWH                                                 object
BWD                                                 object
BWA                                                 object
IWH                                                 object
IWD                                                 object
IWA                                                 object
LBH                                                 object
LBD                                                 object
LBA                                                 object
WHH                                                 object
WHD                                                 object
WHA                                                 object
VCH                                                 object
VCD                                                 object
VCA                                                 object
buildUpPlaySpeed_home                                int64
buildUpPlayDribbling_home                          float64
buildUpPlayPassing_home                              int64
buildUpPlayPositioningClass_home_isOrganised          bool
chanceCreationPassing_home                           int64
chanceCreationCrossing_home                          int64
chanceCreationShooting_home                          int64
chanceCreationPositioningClass_home_isOrganised       bool
defencePressure_home                                 int64
defenceAggression_home                               int64
defenceTeamWidth_home                                int64
buildUpPlaySpeed_away                                int64
buildUpPlayDribbling_away                          float64
buildUpPlayPassing_away                              int64
buildUpPlayPositioningClass_away_isOrganised          bool
chanceCreationPassing_away                           int64
chanceCreationCrossing_away                          int64
chanceCreationShooting_away                          int64
chanceCreationPositioningClass_away_isOrganised       bool
defencePressure_away                                 int64
defenceAggression_away                               int64
defenceTeamWidth_away                                int64
corner_home                                          int64
corner_away                                          int64
cross_home                                           int64
cross_away                                           int64
throwin_home                                         int64
throwin_away                                         int64
shoton_home                                          int64
shoton_away                                          int64
shotoff_home                                         int64
shotoff_away                                         int64
foulcommit_home                                      int64
foulcommit_away                                      int64
ycard_home                                           int64
ycard_away                                           int64
rcard_home                                           int64
rcard_away                                           int64
dtype: object

In [10]:
conn.close()